Rmarkdown to pre-process scRNA seq from baseline (aka unstimated) PBMCs.

Create the Seurat Object.

.libPaths(c("/hpc/packages/minerva-centos7/rpackages/4.2.0/site-library", "/hpc/packages/minerva-centos7/rpackages/bioconductor/3.15", .libPaths()))

library(sctransform, lib.loc = "/hpc/users/richaa21/.Rlib")
library(Seurat, lib.loc = "/hpc/users/richaa21/.Rlib")
## Loading required package: SeuratObject
## Loading required package: sp
## 'SeuratObject' was built under R 4.2.0 but the current version is
## 4.3.0; it is recomended that you reinstall 'SeuratObject' as the ABI
## for R may have changed
## 'SeuratObject' was built with package 'Matrix' 1.6.4 but the current
## version is 1.6.5; it is recomended that you reinstall 'SeuratObject' as
## the ABI for 'Matrix' may have changed
## 
## Attaching package: 'SeuratObject'
## The following object is masked from 'package:base':
## 
##     intersect
library(patchwork)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
pbmc.data <- Read10X(data.dir = "/sc/arion/projects/ad-omics/ashley/data/Unstim/outs/per_sample_outs/Unstim/count/sample_filtered_feature_bc_matrix")
pbmc.unstim <- CreateSeuratObject(counts = pbmc.data, project = "pbmc_sc", min.cells = 3, min.features = 200)
pbmc.unstim
## An object of class Seurat 
## 24998 features across 21114 samples within 1 assay 
## Active assay: RNA (24998 features, 0 variable features)
##  1 layer present: counts

Lets check how many cells and features we are starting with.

length(colnames(pbmc.unstim)) #number of cells
## [1] 21114
length(rownames(pbmc.unstim)) #number of features 
## [1] 24998

Quality Control (QC)

a. QC mitochondiral genes. The PercentFeatureSet() calculates the % of counts originating from a set of features - here we are first looking at mitochondiral features, which are genes starting with MT. 
b. By using [[ ]], I am adding columns to the pbmc matrix to store this QC data. 
pbmc.unstim[["percent.mt"]] <- PercentageFeatureSet(pbmc.unstim, pattern = "^MT-")

# Show QC metrics for the first 5 cells
head(pbmc.unstim@meta.data, 5)
##                    orig.ident nCount_RNA nFeature_RNA percent.mt
## AAACCTGAGAAACGCC-1    pbmc_sc      16199         3729   1.524785
## AAACCTGAGAAGGGTA-1    pbmc_sc      12545         4071   4.161020
## AAACCTGAGACCTAGG-1    pbmc_sc       3125         1519   3.392000
## AAACCTGAGATAGTCA-1    pbmc_sc      10924         3523   4.137679
## AAACCTGAGCAAATCA-1    pbmc_sc      10657         3697   6.615370
library(farver, lib.loc = "/hpc/packages/minerva-centos7/rpackages/4.2.0/site-library")
VlnPlot(pbmc.unstim, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)

plot1 <- FeatureScatter(pbmc.unstim, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(pbmc.unstim, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
plot1 

plot2

# I will select for %mt under 10% and features greater than 200 to capture alive healthy cells. 
pbmc.unstim <- subset(pbmc.unstim, subset = nFeature_RNA > 200 & percent.mt < 10)
head(pbmc.unstim@meta.data)
##                    orig.ident nCount_RNA nFeature_RNA percent.mt
## AAACCTGAGAAACGCC-1    pbmc_sc      16199         3729   1.524785
## AAACCTGAGAAGGGTA-1    pbmc_sc      12545         4071   4.161020
## AAACCTGAGACCTAGG-1    pbmc_sc       3125         1519   3.392000
## AAACCTGAGATAGTCA-1    pbmc_sc      10924         3523   4.137679
## AAACCTGAGCAAATCA-1    pbmc_sc      10657         3697   6.615370
## AAACCTGAGCAGGCTA-1    pbmc_sc      32387         6522   4.421527

Normalize the data.

The data is normalized based on the feature (number of genes in a cell) by the total expression. This number is multiplied by 10,000 and then log transformed. The function to do this is “NormalizeData.” The values specied below are the default values of this function.

pbmc.unstim <- NormalizeData(pbmc.unstim, normalization.method = "LogNormalize", scale.factor = 10000)
## Normalizing layer: counts

Identification of highly variable features

pbmc.unstim <- FindVariableFeatures(pbmc.unstim, selection.method = "vst", nfeatures = 2000)
## Finding variable features for layer counts
# Identify the 10 most highly variable genes
top10 <- head(VariableFeatures(pbmc.unstim), 10)

# plot variable features with and without labels
plot1 <- VariableFeaturePlot(pbmc.unstim)
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE, xnudge = 0, ynudge = 0)
plot1 

plot2

## Scaling the Data The data is scaled by doing a linear transformation. The ScaleData function does this by shifting the expression of genes so that the mean expression becomes 0 and the variance is 1. By default, only variable genes are scaled. This is changed by features = all.genes

##Scaling RNA data, we only scale the variable features here for efficiency
all.genes <- rownames(pbmc.unstim)
pbmc.unstim <- ScaleData(pbmc.unstim, vars.to.regress = c("percent.mt"))
## Regressing out percent.mt
## Centering and scaling data matrix

Perform linear dimensional reduction

For the first principal components, RunPCA shows the most positive (correlated) and most negative (anticorrelated) genes

pbmc.unstim <- RunPCA(pbmc.unstim, features = VariableFeatures(object = pbmc.unstim))
## PC_ 1 
## Positive:  IL7R, FP236383.3, SQSTM1, LTB, TCF7, AQP3, JAML, FP671120.4, HERPUD1, PIM2 
##     CD27, TIMP1, CCR7, TNFSF8, NCDN, CD40LG, BEX3, C1orf162, LINC00861, SNED1 
##     CCR4, IGFLR1, TRBV6-2, RCBTB2, MYC, TRAV8-3, FAAH2, WDR86-AS1, HSF4, LRRN3 
## Negative:  RRM2, MKI67, TYMS, TUBA1B, TOP2A, STMN1, UBE2C, PCLAF, CDK1, KIFC1 
##     ASF1B, NUSAP1, ZWINT, CCNA2, CENPF, ASPM, KNL1, HIST1H1B, GTSE1, TPX2 
##     SPC25, PKMYT1, TUBB, AURKB, CDCA8, CDCA2, BIRC5, CKAP2L, HJURP, GGH 
## PC_ 2 
## Positive:  KLRD1, TYROBP, CTSW, FCER1G, PRF1, GZMB, NKG7, CCL3, CCL4, KLRC1 
##     IL2RB, SH2D1B, FCGR3A, PLEK, KLRC2, NCAM1, CST7, EOMES, KIR2DL4, AOAH 
##     CCL5, TRDC, GNLY, GZMA, NCR1, FASLG, TOX, SLAMF7, CD38, KLRK1 
## Negative:  IL7R, TIMP1, S100A6, AQP3, IL32, COTL1, S100A4, SNED1, WDR86-AS1, ANP32B 
##     STAT1, IL9R, TUBA4A, ITGB1, MYC, PLK1, CCNB2, LMNA, DLGAP5, CENPA 
##     SOS1, TUBA1C, HMMR, TCF7, PLP2, CRIP1, CEP55, CENPF, CDCA3, PRDX1 
## PC_ 3 
## Positive:  GINS2, MCM2, CDC6, MCM6, MCM5, MCM7, MCM4, CDC45, E2F1, MCM3 
##     DTL, PAICS, UNG, FAM111B, FEN1, UHRF1, SLBP, HSP90AB1, CCNE2, PCNA 
##     HELLS, CHAF1B, CDCA7, SLC29A1, MCM10, CLSPN, NME1, MIF, DUT, CTNNAL1 
## Negative:  PLK1, CCNB1, KIF20A, CDC20, CENPA, NEK2, HMMR, CCNB2, DLGAP5, PSRC1 
##     ASPM, KIF14, TROAP, CENPF, CENPE, PIF1, AURKA, PIMREG, ARL6IP1, KIF2C 
##     UBALD2, PRR11, KNSTRN, GZMK, DEPDC1, CDCA8, UBE2C, KPNA2, TPX2, BIRC5 
## PC_ 4 
## Positive:  CD79A, JCHAIN, POU2AF1, IGHM, BASP1, BLNK, MEF2C, SYK, TSPAN33, TCF4 
##     RALGPS2, IGLC2, SPI1, RHEX, CD40, IGHA1, BANK1, NCF1, CDK14, CD19 
##     LINC00926, FCRL5, LINC02362, BTK, TNFRSF17, AC012236.1, IGKC, SLC43A2, RASGRP3, IGLC3 
## Negative:  S100A4, CCL5, LTB, S100A6, IL32, GZMA, PFN1, CST7, HOPX, NKG7 
##     CD8A, CD8B, ALOX5AP, GZMM, FGFBP2, THEMIS, LYAR, GZMK, ACTB, PRF1 
##     FLNA, TMSB10, GZMB, LAG3, CFL1, SAMD3, AHNAK, CXCR3, ANXA2, NCR3 
## PC_ 5 
## Positive:  LGALS1, S100A4, S100A6, IL32, ANXA2, TXN, HLA-DPA1, GAPDH, TMSB10, HLA-DRB1 
##     PFN1, CD74, ACTB, HLA-DQA1, HLA-DRA, CFL1, HOPX, PRDX1, HLA-DPB1, ACTG1 
##     ENO1, FTL, AHNAK, LINC02694, LGALS3, VIM, COTL1, JPT1, LAG3, FLNA 
## Negative:  TCF7, CCR7, TXK, PLAC8, FP236383.3, FCER1G, CXCR4, PTMA, ID3, NCAM1 
##     TYROBP, HIST1H1D, PTGDR, SELL, BACH2, C1orf162, SH2D1B, SESN3, KCNQ5, FHIT 
##     CTBP2, FAM111B, KLRF1, E2F7, RNF130, AC139720.1, EXO1, MYBL2, E2F8, ESCO2
print(pbmc.unstim[["pca"]], dims = 1:5, nfeatures = 5)
## PC_ 1 
## Positive:  IL7R, FP236383.3, SQSTM1, LTB, TCF7 
## Negative:  RRM2, MKI67, TYMS, TUBA1B, TOP2A 
## PC_ 2 
## Positive:  KLRD1, TYROBP, CTSW, FCER1G, PRF1 
## Negative:  IL7R, TIMP1, S100A6, AQP3, IL32 
## PC_ 3 
## Positive:  GINS2, MCM2, CDC6, MCM6, MCM5 
## Negative:  PLK1, CCNB1, KIF20A, CDC20, CENPA 
## PC_ 4 
## Positive:  CD79A, JCHAIN, POU2AF1, IGHM, BASP1 
## Negative:  S100A4, CCL5, LTB, S100A6, IL32 
## PC_ 5 
## Positive:  LGALS1, S100A4, S100A6, IL32, ANXA2 
## Negative:  TCF7, CCR7, TXK, PLAC8, FP236383.3

The PCA results can be visualized in different ways.

VizDimLoadings(pbmc.unstim, dims = 1:2, reduction = "pca")

DimPlot(pbmc.unstim, reduction = "pca")

DimHeatmap(pbmc.unstim, dims = 1, cells = 500, balanced = TRUE)

DimHeatmap(pbmc.unstim, dims = 1:15, cells = 500, balanced = TRUE)

Determine the ‘dimensionality’ of the dataset

Cells will be clustered based on PCA. How many PC to use is dependent on many factors. For example, if trying to analyze a rare cell subset, you might want to add more PCs. Usually, the first 10 is good to see dimensionality of the data.

ElbowPlot(pbmc.unstim)

Cluster the cells.

##We select the top 15 PCs for clustering and tSNE based on PCElbowPlot
pbmc.unstim <- FindNeighbors(pbmc.unstim, reduction = "pca", dims = 1:15)
## Computing nearest neighbor graph
## Computing SNN
pbmc.unstim <- FindClusters(pbmc.unstim, resolution = 0.5, verbose = FALSE)
head(Idents(pbmc.unstim), 5)
## AAACCTGAGAAACGCC-1 AAACCTGAGAAGGGTA-1 AAACCTGAGACCTAGG-1 AAACCTGAGATAGTCA-1 
##                 10                  2                  6                  0 
## AAACCTGAGCAAATCA-1 
##                  1 
## Levels: 0 1 2 3 4 5 6 7 8 9 10 11

Run non-linear dimensional reduction (UMAP/tSNE)

pbmc.unstim <- RunUMAP(pbmc.unstim, dims = 1:15)
## 10:15:59 UMAP embedding parameters a = 0.9922 b = 1.112
## 10:15:59 Read 20924 rows and found 15 numeric columns
## 10:15:59 Using Annoy for neighbor search, n_neighbors = 30
## 10:15:59 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 10:16:01 Writing NN index file to temp file /tmp/RtmpA4NXVY/filebc682171bd0c
## 10:16:01 Searching Annoy index using 1 thread, search_k = 3000
## 10:16:08 Annoy recall = 100%
## 10:16:09 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 10:16:10 Initializing from normalized Laplacian + noise (using RSpectra)
## 10:16:10 Commencing optimization for 200 epochs, with 892462 positive edges
## 10:16:36 Optimization finished
# note that you can set `label = TRUE` or use the LabelClusters function to help label
# individual clusters
DimPlot(pbmc.unstim, reduction = "umap", label = TRUE)

pbmc.unstim <- RunTSNE(pbmc.unstim, reduction = "pca", dims = 1:15)
DimPlot(pbmc.unstim, reduction = "tsne", label = TRUE)

Lets check our metadata now of the seurat object to see what has been added.

head(pbmc.unstim@meta.data)
##                    orig.ident nCount_RNA nFeature_RNA percent.mt
## AAACCTGAGAAACGCC-1    pbmc_sc      16199         3729   1.524785
## AAACCTGAGAAGGGTA-1    pbmc_sc      12545         4071   4.161020
## AAACCTGAGACCTAGG-1    pbmc_sc       3125         1519   3.392000
## AAACCTGAGATAGTCA-1    pbmc_sc      10924         3523   4.137679
## AAACCTGAGCAAATCA-1    pbmc_sc      10657         3697   6.615370
## AAACCTGAGCAGGCTA-1    pbmc_sc      32387         6522   4.421527
##                    RNA_snn_res.0.5 seurat_clusters
## AAACCTGAGAAACGCC-1              10              10
## AAACCTGAGAAGGGTA-1               2               2
## AAACCTGAGACCTAGG-1               6               6
## AAACCTGAGATAGTCA-1               0               0
## AAACCTGAGCAAATCA-1               1               1
## AAACCTGAGCAGGCTA-1               3               3
#We now have seurat clusters and RNA_snn_res.0.5 columns added. 

Finding differentially expressed features (cluster biomarkers)

# find markers for every cluster compared to all remaining cells, report only the positive
# ones
markers_unstim <- FindAllMarkers(pbmc.unstim, only.pos = TRUE)
## Calculating cluster 0
## Calculating cluster 1
## Calculating cluster 2
## Calculating cluster 3
## Calculating cluster 4
## Calculating cluster 5
## Calculating cluster 6
## Calculating cluster 7
## Calculating cluster 8
## Calculating cluster 9
## Calculating cluster 10
## Calculating cluster 11

VlnPlot will display the differential expression across the clusters. For example, I am looking here at CD8A and CD4 expression in the clusters.

VlnPlot(pbmc.unstim, features = c("CD8A", "CD4"))

The raw counts can also be shown instead by adding some parameters.

VlnPlot(pbmc.unstim, features = c("CD8A", "CD4"), slot = "counts", log = TRUE)

FeaturePlot(pbmc.unstim, features = c("MS4A1", "GNLY", "CD3E", "CD14", "FCER1A", "FCGR3A", "LYZ", "PPBP",
    "CD8A"))

Combine Seurat object with demuxlet output.

First, load and edit the .best demuxlet output file to make it more compatible.

.libPaths(c("/hpc/packages/minerva-centos7/rpackages/4.2.0/site-library", "/hpc/packages/minerva-centos7/rpackages/bioconductor/3.15", .libPaths()))

library(tidyr)
u_demuxlet = read.delim("/sc/arion/projects/ad-omics/ashley/data/Unstim.best", header = T, stringsAsFactors = F, check.names = F)
head(u_demuxlet)
##              BARCODE RD.TOTL RD.PASS RD.UNIQ N.SNP
## 1 AAACCTGAGAAACGCC-1   42286   22364   20626  1854
## 2 AAACCTGAGAAGGGTA-1   35144    4231    3579  1888
## 3 AAACCTGAGACCTAGG-1    8284    1119     928   608
## 4 AAACCTGAGATAGTCA-1   31640    3692    3025  1681
## 5 AAACCTGAGCAAATCA-1   34136    4239    3322  1982
## 6 AAACCTGAGCAGGCTA-1   93780   12646   10291  4636
##                                                BEST             SNG.1ST
## 1                           SNG-GSA8_0_NYUMD0315-01 GSA8_0_NYUMD0315-01
## 2 DBL-GSA8_0_NYUMD0315-01-GSA8_0_NYUMD0327-01-0.500 GSA8_0_NYUMD0315-01
## 3                           SNG-GSA8_0_NYUMD0334-01 GSA8_0_NYUMD0334-01
## 4                           SNG-GSA8_0_NYUMD0289-01 GSA8_0_NYUMD0289-01
## 5                           SNG-GSA8_0_NYUMD0334-01 GSA8_0_NYUMD0334-01
## 6 DBL-GSA8_0_NYUMD0327-01-GSA8_0_NYUMD0334-01-0.500 GSA8_0_NYUMD0327-01
##     SNG.LLK1             SNG.2ND   SNG.LLK2   SNG.LLK0             DBL.1ST
## 1  -632.7918 GSA8_0_NYUMD0289-01 -1794.0976 -1099.8520 GSA8_0_NYUMD0289-01
## 2 -1267.1772 GSA8_0_NYUMD0327-01 -1376.1579 -1201.1205 GSA8_0_NYUMD0315-01
## 3  -220.6593 GSA8_0_NYUMD0327-01  -499.5366  -330.9340 GSA8_0_NYUMD0327-01
## 4  -601.2027 GSA8_0_NYUMD0327-01 -1580.6534  -975.7141 GSA8_0_NYUMD0289-01
## 5  -695.9992 GSA8_0_NYUMD0315-01 -1791.4493 -1106.4335 GSA8_0_NYUMD0315-01
## 6 -2584.2428 GSA8_0_NYUMD0334-01 -3897.3665 -2888.3866 GSA8_0_NYUMD0327-01
##               DBL.2ND ALPHA      LLK12       LLK1       LLK2      LLK10
## 1 GSA8_0_NYUMD0315-01   0.5 -1024.5246 -1794.0976  -632.7918 -1840.7324
## 2 GSA8_0_NYUMD0327-01   0.5  -885.9914 -1267.1772 -1376.1579 -1211.0138
## 3 GSA8_0_NYUMD0334-01   0.5  -301.3090  -499.5366  -220.6593  -458.4461
## 4 GSA8_0_NYUMD0315-01   0.5  -900.3046  -601.2027 -1603.4487  -610.6523
## 5 GSA8_0_NYUMD0334-01   0.5  -992.4663 -1791.4493  -695.9992 -1561.3775
## 6 GSA8_0_NYUMD0334-01   0.5 -2274.3541 -2584.2428 -3897.3665 -2729.2507
##        LLK20      LLK00   PRB.DBL PRB.SNG1
## 1 -1024.5246 -1155.0510 2.49e-171        1
## 2 -1279.7057 -1088.3092  1.00e+00        1
## 3  -313.0930  -338.7091  3.14e-36        1
## 4  -900.3046  -996.4076 4.23e-131        1
## 5 -1019.5818 -1123.0042 5.87e-130        1
## 6 -3366.3864 -2648.3299  1.00e+00        1

To edit: I will split the Best column into multiple columns.

u_demuxlet_edit = u_demuxlet %>% 
  mutate(BEST = gsub("-01","", BEST)) %>%
  separate(BEST, into=c("DMX_classification.global","DMX_maxID","DMX_secondID"), sep="-") %>%
  separate(DMX_maxID, into=c("DMX_garbage1","DMX_garbage2","DMX_maxID"), sep ="_") %>%
  separate(DMX_secondID, into=c("DMX_garbage3","DMX_garbage4","DMX_secondID"), sep ="_") %>%
  select(-contains("garbage"))
## Warning: Expected 3 pieces. Additional pieces discarded in 3082 rows [2, 6, 8,
## 11, 15, 23, 24, 29, 30, 31, 34, 42, 54, 58, 59, 60, 68, 75, 76, 83, ...].
## Warning: Expected 3 pieces. Missing pieces filled with `NA` in 18152 rows [1,
## 3, 4, 5, 7, 9, 10, 12, 13, 14, 16, 17, 18, 19, 20, 21, 22, 25, 26, 27, ...].
head(u_demuxlet_edit)
##              BARCODE RD.TOTL RD.PASS RD.UNIQ N.SNP DMX_classification.global
## 1 AAACCTGAGAAACGCC-1   42286   22364   20626  1854                       SNG
## 2 AAACCTGAGAAGGGTA-1   35144    4231    3579  1888                       DBL
## 3 AAACCTGAGACCTAGG-1    8284    1119     928   608                       SNG
## 4 AAACCTGAGATAGTCA-1   31640    3692    3025  1681                       SNG
## 5 AAACCTGAGCAAATCA-1   34136    4239    3322  1982                       SNG
## 6 AAACCTGAGCAGGCTA-1   93780   12646   10291  4636                       DBL
##   DMX_maxID DMX_secondID             SNG.1ST   SNG.LLK1             SNG.2ND
## 1 NYUMD0315         <NA> GSA8_0_NYUMD0315-01  -632.7918 GSA8_0_NYUMD0289-01
## 2 NYUMD0315    NYUMD0327 GSA8_0_NYUMD0315-01 -1267.1772 GSA8_0_NYUMD0327-01
## 3 NYUMD0334         <NA> GSA8_0_NYUMD0334-01  -220.6593 GSA8_0_NYUMD0327-01
## 4 NYUMD0289         <NA> GSA8_0_NYUMD0289-01  -601.2027 GSA8_0_NYUMD0327-01
## 5 NYUMD0334         <NA> GSA8_0_NYUMD0334-01  -695.9992 GSA8_0_NYUMD0315-01
## 6 NYUMD0327    NYUMD0334 GSA8_0_NYUMD0327-01 -2584.2428 GSA8_0_NYUMD0334-01
##     SNG.LLK2   SNG.LLK0             DBL.1ST             DBL.2ND ALPHA
## 1 -1794.0976 -1099.8520 GSA8_0_NYUMD0289-01 GSA8_0_NYUMD0315-01   0.5
## 2 -1376.1579 -1201.1205 GSA8_0_NYUMD0315-01 GSA8_0_NYUMD0327-01   0.5
## 3  -499.5366  -330.9340 GSA8_0_NYUMD0327-01 GSA8_0_NYUMD0334-01   0.5
## 4 -1580.6534  -975.7141 GSA8_0_NYUMD0289-01 GSA8_0_NYUMD0315-01   0.5
## 5 -1791.4493 -1106.4335 GSA8_0_NYUMD0315-01 GSA8_0_NYUMD0334-01   0.5
## 6 -3897.3665 -2888.3866 GSA8_0_NYUMD0327-01 GSA8_0_NYUMD0334-01   0.5
##        LLK12       LLK1       LLK2      LLK10      LLK20      LLK00   PRB.DBL
## 1 -1024.5246 -1794.0976  -632.7918 -1840.7324 -1024.5246 -1155.0510 2.49e-171
## 2  -885.9914 -1267.1772 -1376.1579 -1211.0138 -1279.7057 -1088.3092  1.00e+00
## 3  -301.3090  -499.5366  -220.6593  -458.4461  -313.0930  -338.7091  3.14e-36
## 4  -900.3046  -601.2027 -1603.4487  -610.6523  -900.3046  -996.4076 4.23e-131
## 5  -992.4663 -1791.4493  -695.9992 -1561.3775 -1019.5818 -1123.0042 5.87e-130
## 6 -2274.3541 -2584.2428 -3897.3665 -2729.2507 -3366.3864 -2648.3299  1.00e+00
##   PRB.SNG1
## 1        1
## 2        1
## 3        1
## 4        1
## 5        1
## 6        1
table(u_demuxlet_edit$DMX_classification.global) #num of singlets and doublets
## 
##   AMB   DBL   SNG 
##    25  3057 18152
table(u_demuxlet_edit$DMX_maxID) #number of cells identified as each donor
## 
## NYUMD0289 NYUMD0315 NYUMD0327 NYUMD0334 
##      4904      5661      5475      5194
table(u_demuxlet_edit[,c("DMX_classification.global","DMX_maxID")]) #number of singlets or doublets identified as each donor
##                          DMX_maxID
## DMX_classification.global NYUMD0289 NYUMD0315 NYUMD0327 NYUMD0334
##                       AMB         4         5         5        11
##                       DBL      1301      1131       608        17
##                       SNG      3599      4525      4862      5166

Next, I will add this edited demuxlet to the Seurat object.

u_demuxlet_edit.subset <- u_demuxlet_edit[u_demuxlet_edit$BARCODE %in% colnames(pbmc.unstim),]

pbmc.unstim@meta.data <- cbind(pbmc.unstim@meta.data,u_demuxlet_edit.subset$DMX_maxID,u_demuxlet_edit.subset$DMX_classification.global, u_demuxlet_edit.subset$BARCODE)

head(pbmc.unstim@meta.data)
##                    orig.ident nCount_RNA nFeature_RNA percent.mt
## AAACCTGAGAAACGCC-1    pbmc_sc      16199         3729   1.524785
## AAACCTGAGAAGGGTA-1    pbmc_sc      12545         4071   4.161020
## AAACCTGAGACCTAGG-1    pbmc_sc       3125         1519   3.392000
## AAACCTGAGATAGTCA-1    pbmc_sc      10924         3523   4.137679
## AAACCTGAGCAAATCA-1    pbmc_sc      10657         3697   6.615370
## AAACCTGAGCAGGCTA-1    pbmc_sc      32387         6522   4.421527
##                    RNA_snn_res.0.5 seurat_clusters
## AAACCTGAGAAACGCC-1              10              10
## AAACCTGAGAAGGGTA-1               2               2
## AAACCTGAGACCTAGG-1               6               6
## AAACCTGAGATAGTCA-1               0               0
## AAACCTGAGCAAATCA-1               1               1
## AAACCTGAGCAGGCTA-1               3               3
##                    u_demuxlet_edit.subset$DMX_maxID
## AAACCTGAGAAACGCC-1                        NYUMD0315
## AAACCTGAGAAGGGTA-1                        NYUMD0315
## AAACCTGAGACCTAGG-1                        NYUMD0334
## AAACCTGAGATAGTCA-1                        NYUMD0289
## AAACCTGAGCAAATCA-1                        NYUMD0334
## AAACCTGAGCAGGCTA-1                        NYUMD0327
##                    u_demuxlet_edit.subset$DMX_classification.global
## AAACCTGAGAAACGCC-1                                              SNG
## AAACCTGAGAAGGGTA-1                                              DBL
## AAACCTGAGACCTAGG-1                                              SNG
## AAACCTGAGATAGTCA-1                                              SNG
## AAACCTGAGCAAATCA-1                                              SNG
## AAACCTGAGCAGGCTA-1                                              DBL
##                    u_demuxlet_edit.subset$BARCODE
## AAACCTGAGAAACGCC-1             AAACCTGAGAAACGCC-1
## AAACCTGAGAAGGGTA-1             AAACCTGAGAAGGGTA-1
## AAACCTGAGACCTAGG-1             AAACCTGAGACCTAGG-1
## AAACCTGAGATAGTCA-1             AAACCTGAGATAGTCA-1
## AAACCTGAGCAAATCA-1             AAACCTGAGCAAATCA-1
## AAACCTGAGCAGGCTA-1             AAACCTGAGCAGGCTA-1

Pre-Processing the new object.

The Seurat Object has already been preprocessed in my case, so this should be clean)

pbmc.unstim[["percent.mt"]] <- PercentageFeatureSet(pbmc.unstim, pattern = "^MT-")

library(cowplot)
## 
## Attaching package: 'cowplot'
## The following object is masked from 'package:patchwork':
## 
##     align_plots
# look at distribution of metrics by classification
plot_grid(VlnPlot(pbmc.unstim, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3, group.by = "u_demuxlet_edit.subset$DMX_maxID"))

VlnPlot(pbmc.unstim, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3, group.by = "u_demuxlet_edit.subset$DMX_classification.global")

Now, select for MT content under 10% and for nfeatureRNA > 200 to ensure getting alive cells. I previously did this, so should see no change in cell numnbers.

pbmc.unstim <- subset(pbmc.unstim, subset = nFeature_RNA > 200 & percent.mt < 10)
length(colnames(pbmc.unstim))
## [1] 20924
length(rownames(pbmc.unstim))
## [1] 24998

Add column to meta data to identify seurat object as Basline condition. Also rename some columns for clarity purposes.

pbmc.unstim@meta.data$condition <- 'Baseline'
names(pbmc.unstim@meta.data)[names(pbmc.unstim@meta.data) == "u_demuxlet_edit.subset$DMX_classification.global"] <- "DMX_classification.global"
names(pbmc.unstim@meta.data)[names(pbmc.unstim@meta.data) == "u_demuxlet_edit.subset$DMX_maxID"] <- "DMX_maxID"
names(pbmc.unstim@meta.data)[names(pbmc.unstim@meta.data) == "u_demuxlet_edit.subset$BARCODE"] <- "Barcode"
head(pbmc.unstim@meta.data)
##                    orig.ident nCount_RNA nFeature_RNA percent.mt
## AAACCTGAGAAACGCC-1    pbmc_sc      16199         3729   1.524785
## AAACCTGAGAAGGGTA-1    pbmc_sc      12545         4071   4.161020
## AAACCTGAGACCTAGG-1    pbmc_sc       3125         1519   3.392000
## AAACCTGAGATAGTCA-1    pbmc_sc      10924         3523   4.137679
## AAACCTGAGCAAATCA-1    pbmc_sc      10657         3697   6.615370
## AAACCTGAGCAGGCTA-1    pbmc_sc      32387         6522   4.421527
##                    RNA_snn_res.0.5 seurat_clusters DMX_maxID
## AAACCTGAGAAACGCC-1              10              10 NYUMD0315
## AAACCTGAGAAGGGTA-1               2               2 NYUMD0315
## AAACCTGAGACCTAGG-1               6               6 NYUMD0334
## AAACCTGAGATAGTCA-1               0               0 NYUMD0289
## AAACCTGAGCAAATCA-1               1               1 NYUMD0334
## AAACCTGAGCAGGCTA-1               3               3 NYUMD0327
##                    DMX_classification.global            Barcode condition
## AAACCTGAGAAACGCC-1                       SNG AAACCTGAGAAACGCC-1  Baseline
## AAACCTGAGAAGGGTA-1                       DBL AAACCTGAGAAGGGTA-1  Baseline
## AAACCTGAGACCTAGG-1                       SNG AAACCTGAGACCTAGG-1  Baseline
## AAACCTGAGATAGTCA-1                       SNG AAACCTGAGATAGTCA-1  Baseline
## AAACCTGAGCAAATCA-1                       SNG AAACCTGAGCAAATCA-1  Baseline
## AAACCTGAGCAGGCTA-1                       DBL AAACCTGAGCAGGCTA-1  Baseline
saveRDS(pbmc.unstim, file = "/sc/arion/projects/ad-omics/ashley/PD_Stim/pbmc.unstim.final.RDS")